Solution for the Kaggle Titanic competition
Data information:
In [51]:
# Import libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Get the data (standard Kaggle Titanic file names)
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
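Before dropping any columns, it can help to confirm what was loaded. A minimal sanity check, not part of the original notebook; the expected shapes assume the standard Kaggle Titanic files:
# Quick sanity check of what was loaded (sketch)
print(train.shape, test.shape)   # expected (891, 12) and (418, 11) for the standard files
print(train.columns.tolist())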
In [3]:
# First 5 rows
train.head()
Out[3]:
In [4]:
train.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)
test.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)
train.head()
Out[4]:
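Name and Ticket are high-cardinality text columns and Cabin is mostly empty, which is why they are dropped. A sketch that shows this (it re-reads the raw file, since the columns are already gone from train at this point; assumes the lower-case Kaggle file name used above):
# Sketch: why Name, Ticket and Cabin are dropped
raw = pd.read_csv('train.csv')
print(raw[['Name', 'Ticket', 'Cabin']].nunique())   # high-cardinality text columns
print(raw['Cabin'].isnull().mean())                 # Cabin is mostly missing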
In [5]:
one_hot_train = pd.get_dummies(train)
one_hot_test = pd.get_dummies(test)
# First five rows of the encoded training set
one_hot_train.head()
Out[5]:
In [6]:
# First five rows of the encoded test set
one_hot_test.head()
Out[6]:
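Because pd.get_dummies is applied to the train and test frames separately, the two can end up with different dummy columns if a category appears in only one split. For this dataset the categories happen to match, but a defensive alignment step (a sketch, not used in this notebook) would look like:
# Sketch: align the test columns to the training feature columns;
# reindex adds an all-zero column for any dummy missing from the test set.
train_feature_cols = one_hot_train.columns.drop('Survived')
one_hot_test = one_hot_test.reindex(columns=train_feature_cols, fill_value=0)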
In [7]:
# Count the null values per column (train)
one_hot_train.isnull().sum().sort_values(ascending=False)
Out[7]:
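seaborn is imported above but not used anywhere; one optional way to actually visualize the missing values (a sketch, not part of the original flow) is a heatmap of isnull():
# Optional sketch: missing values as a heatmap (stripes mark NaNs in Age)
sns.heatmap(one_hot_train.isnull(), cbar=False)
plt.title('Missing values in the training set')
plt.show()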
In [8]:
# Fill the missing Age values with the mean age of each set
one_hot_train['Age'] = one_hot_train['Age'].fillna(one_hot_train['Age'].mean())
one_hot_test['Age'] = one_hot_test['Age'].fillna(one_hot_test['Age'].mean())
one_hot_train.isnull().sum()
Out[8]:
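Filling with the global mean is the simplest choice. A slightly more informed alternative (a sketch, not used here) is the median Age per passenger class, since Age varies noticeably by Pclass; if used instead of the mean fill above, it would look like:
# Alternative sketch (not applied above): impute Age with the per-class median
age_by_pclass = one_hot_train.groupby('Pclass')['Age'].transform('median')
one_hot_train['Age'] = one_hot_train['Age'].fillna(age_by_pclass)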
In [9]:
# Count the null values per column (test)
one_hot_test.isnull().sum().sort_values(ascending=False)
Out[9]:
In [17]:
# Fill the missing Fare value with the mean of all fares
one_hot_test['Fare'] = one_hot_test['Fare'].fillna(one_hot_test['Fare'].mean())
one_hot_test.isnull().sum().sort_values(ascending=False)
Out[17]:
In [60]:
# Create the features and the target (note: PassengerId is kept as a feature)
feature = one_hot_train.drop('Survived', axis=1)
target = one_hot_train['Survived']
# Model creation
rf = RandomForestClassifier(random_state=1, criterion='gini', max_depth=10, n_estimators=50, n_jobs=-1)
rf.fit(feature, target)
Out[60]:
In [61]:
# Accuracy on the training data (an optimistic estimate)
rf.score(feature, target)
Out[61]:
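The score above is computed on the same data the forest was trained on, so it overstates how well the model generalizes. A quick cross-validation estimate (a sketch, not part of the original notebook) gives a more realistic number:
# Sketch: 5-fold cross-validated accuracy as a less optimistic estimate
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rf, feature, target, cv=5)
print(cv_scores.mean(), cv_scores.std())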
In [62]:
# Build a pandas DataFrame with the 'PassengerId' and 'Survived' columns
submission = pd.DataFrame()
submission['PassengerId'] = one_hot_test['PassengerId']
submission['Survived'] = rf.predict(one_hot_test)
# Generate the CSV file with 'to_csv' from Pandas
submission.to_csv('submission.csv', index=False)
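As an optional final check (a sketch, not in the original notebook), the fitted forest exposes feature importances, which show which columns drove the predictions, and the submission can be previewed before uploading:
# Sketch: rank the features by importance in the fitted forest
importances = pd.Series(rf.feature_importances_, index=feature.columns)
print(importances.sort_values(ascending=False))
# Preview the submission file contents
print(submission.head())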